# Download the dataset and the pre-trained word vectors
# ! wget https://nlp.stanford.edu/data/glove.6B.zip
# !unzip  ./glove.6B.zip
# ! wget http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz
# !tar -xvzf ./news20.tar.gz
from __future__ import print_function 
import os 
import sys 
import numpy as np 
from keras.preprocessing.text     import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.utils  import to_categorical 
from keras.layers import Dense,Input,GlobalMaxPooling1D
from keras.layers import Conv1D,MaxPooling1D,Embedding
from keras.models import Model 
from keras.initializers import Constant 

# Root folder that is expected to hold the unpacked GloVe archive and corpus.
BASE_DIR = './'
# Location of the pre-trained GloVe vectors.
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
# Location of the raw text corpus.
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')

# Hyper-parameters shared by the preprocessing and model sections below.
max_sequence_length = 1000  # maxlen handed to pad_sequences
max_words_num = 20000       # num_words handed to the Tokenizer
embedding_dim = 100         # width of each row of the embedding matrix
validation_split = 0.2      # fraction of samples held out for validation

# 1. Load the GloVe vectors into a dict mapping each word to its vector.
print('Indexing word vectors.')

embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
with open(glove_path, encoding='utf-8') as glove_file:
	for row in glove_file:
		# maxsplit=1 splits only at the first run of whitespace: the head is
		# the word, the tail is the text of its coefficients.
		token, vector_text = row.split(maxsplit=1)
		# Parse the space-separated coefficients into a float32 array.
		embeddings_index[token] = np.fromstring(vector_text, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))
"""
Inspect a word vector:
print('was')
print(embeddings_index['was'])
See figure1 for the output.
"""
# 2. Read the corpus: one text per file, labelled by its parent directory.
print('Processing text dataset')

texts = []
labels = []
# Maps each label (directory) name to its numeric id.
labels_index = {}

# Files are decoded as latin-1 on Python 3; on Python 2 no encoding keyword
# is passed to open(). The choice does not depend on the file, so it is made
# once here.
read_kwargs = {'encoding': 'latin-1'} if sys.version_info >= (3,) else {}

for class_name in sorted(os.listdir(TEXT_DATA_DIR)):
	class_dir = os.path.join(TEXT_DATA_DIR, class_name)
	if not os.path.isdir(class_dir):
		continue
	# The dict's current size is the next unused id, so ids follow the
	# sorted directory order.
	label_id = len(labels_index)
	labels_index[class_name] = label_id
	for file_name in sorted(os.listdir(class_dir)):
		# Only files whose names consist purely of digits are read.
		if not file_name.isdigit():
			continue
		with open(os.path.join(class_dir, file_name), **read_kwargs) as doc:
			body = doc.read()
		# Skip the header: drop everything before the first blank line.
		header_end = body.find('\n\n')
		if header_end > 0:
			body = body[header_end:]
		texts.append(body)
		labels.append(label_id)

print('Found %s texts.' % len(texts))

# 3. Vectorize the texts into a 2D integer tensor.
tokenizer = Tokenizer(num_words=max_words_num)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
print('Found %s unique tokens.' % len(word_index))

# Pad the sequences to a common length and one-hot encode the label ids.
data = pad_sequences(sequences, maxlen=max_sequence_length)
label_ids = np.asarray(labels)
labels = to_categorical(label_ids)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# 4. Shuffle the samples and split them into training and validation sets.
# The same permutation is applied to data and labels so pairs stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(validation_split * data.shape[0])

# Slice at an explicit boundary instead of a negative count: when
# num_validation_samples is 0, data[:-0] is empty and data[-0:] is the whole
# array, which would leave the training set empty. The boundary is clamped
# at 0 so an over-large count still yields an empty training set.
split_at = max(data.shape[0] - num_validation_samples, 0)
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]


# 5. Build the embedding matrix: row i holds the pre-trained vector of the
# word whose tokenizer index is i.
print('Preparing embedding matrix.')

num_words = min(max_words_num, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
	# Indices at or beyond the vocabulary cap are skipped.
	if row < max_words_num:
		pretrained = embeddings_index.get(token)
		# Words missing from the pre-trained index keep their all-zero row.
		if pretrained is not None:
			embedding_matrix[row] = pretrained

# Load the pre-trained vectors into an Embedding layer; trainable=False
# keeps them fixed during training.
embedding_layer = Embedding(
	num_words,
	embedding_dim,
	embeddings_initializer=Constant(embedding_matrix),
	input_length=max_sequence_length,
	trainable=False,
)

print('Training model.')

# 6. Build the 1D CNN classifier and train it.
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Two convolution + max-pooling stages, followed by a final convolution
# that is pooled globally.
x = embedded_sequences
for _ in range(2):
	x = Conv1D(128, 5, activation='relu')(x)
	x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# One softmax output per entry in labels_index.
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
	loss='categorical_crossentropy',
	optimizer='rmsprop',
	metrics=['acc'],
)

model.fit(
	x_train,
	y_train,
	batch_size=128,
	epochs=5,
	validation_data=(x_val, y_val),
)